In [1]:
    
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import PolynomialFeatures
    
In [2]:
    
def true_func(X):
    """Return the noise-free target value cos(1.5 * pi * X) for input X.

    X may be a scalar or a NumPy array; the cosine is applied element-wise.
    """
    return np.cos(1.5 * np.pi * X)
    
In [3]:
    
# Seed NumPy's global RNG so the samples below are reproducible.
# NOTE: the draws below consume the RNG in sequence; reordering these
# statements would change the generated data.
np.random.seed(0)
# Training Set: No. of random samples used for training the model
n_samples = 30
# Training inputs: uniform random values, sorted ascending.
x = np.sort(np.random.rand(n_samples))
# Training targets: true function plus Gaussian noise scaled by 0.1.
y = true_func(x) + np.random.randn(n_samples) * 0.1
# Test Set: 100 samples for which we want the model to predict value
n_test = 100
# Test inputs: evenly spaced points covering [0, 1].
x_test = np.linspace(0, 1, n_test)
# Test targets: true function plus Gaussian noise scaled by 0.1.
y_test_actual = true_func(x_test) + np.random.randn(n_test) * 0.1
    
In [4]:
    
# Peek at the first five (sorted) training inputs.
x[:5]
    
    Out[4]:
In [5]:
    
# First five training inputs alongside their noisy targets.
x[:5],y[:5]
    
    Out[5]:
In [6]:
    
# Function to add more features from existing features
# in this case degree is the desired order of polynomials
# Example degree 3 with 1 feature x would output: x,x^2,x^3
# Similarly, if there are multiple features x1,x2: x1,x2,x1^2,x1*x2,x2^2,x1^3,... and so forth
def generate_higher_order(degrees, x):
    """Expand a feature matrix with polynomial terms up to a given degree.

    Parameters
    ----------
    degrees : int
        Highest polynomial degree to generate.
    x : array-like, 2-D
        Input features, one row per sample and one column per feature.

    Returns
    -------
    pandas.DataFrame
        The expanded features (no bias column), with columns named
        'x0', 'x1', ... in the order produced by PolynomialFeatures.
    """
    expander = PolynomialFeatures(degree=degrees, include_bias=False)
    expanded = expander.fit_transform(x)

    # Name columns by position: x0, x1, x2, ...
    column_names = ['x' + str(i) for i in range(expanded.shape[1])]
    return pd.DataFrame(expanded, columns=column_names)
    
In [7]:
    
# Directory (relative to this notebook) holding the generated CSV files and
# the downloaded prediction outputs. Built with os.path.join so the path
# separators are correct on every platform, not just Windows.
data_path = os.path.join('..', 'Data', 'RegressionExamples', 'under_over_fit_30samples')
    
In [8]:
    
# Polynomial degrees for feature generation; the loops that follow write
# one set of CSV files per degree listed here.
degrees = [1, 4, 15]
    
In [9]:
    
# Write one training CSV per polynomial degree: the expanded features
# plus the noisy target column 'y', indexed by a 'Row' column.
for degree in degrees:
    train_df = generate_higher_order(degree, x.reshape((n_samples, 1)))
    train_df['y'] = y
    train_file = 'fit_degree_{0}_example_train{1}.csv'.format(degree, n_samples)
    train_df.to_csv(os.path.join(data_path, train_file),
                    index=True,
                    index_label='Row')
    
In [10]:
    
# For every polynomial degree, write two CSV files from the test inputs:
#   * test set - features only; AWSML would predict the target.
#   * eval set - the same features plus the actual target column 'y'.
for degree in degrees:
    features_df = generate_higher_order(degree, x_test.reshape((n_test, 1)))
    test_file = 'fit_degree_{0}_example_test{1}.csv'.format(degree, n_samples)
    eval_file = 'fit_degree_{0}_example_eval{1}.csv'.format(degree, n_samples)

    # Features only.
    features_df.to_csv(os.path.join(data_path, test_file),
                       index=True,
                       index_label='Row')

    # Features followed by the actual target.
    eval_df = features_df.assign(y=y_test_actual)
    eval_df.to_csv(os.path.join(data_path, eval_file),
                   index=True,
                   index_label='Row')
    
In [11]:
    
# Pull the degree-1 training samples, the evaluation set (features plus
# actual target) and the downloaded degree-1 prediction output.
train_csv = os.path.join(data_path, 'fit_degree_1_example_train30.csv')
eval_csv = os.path.join(data_path, 'fit_degree_1_example_eval30.csv')
d1_predictions_csv = os.path.join(data_path, 'output_deg_1',
                                  'bp-aYBztCIPIdb-fit_degree_1_example_test30.csv.gz')

df_samples = pd.read_csv(train_csv, index_col='Row')
df_actual = pd.read_csv(eval_csv, index_col='Row')

df_d1_predicted = pd.read_csv(d1_predictions_csv)
# Rename the two columns of the prediction file to friendlier names.
df_d1_predicted.columns = ["Row","y_predicted"]
    
In [12]:
    
# Degree-1 model: training samples (blue), evaluation targets (red) and
# the model's predictions (green), all plotted against feature x0.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(df_samples['x0'], df_samples['y'],
           color='b', label='samples')
ax.scatter(df_actual['x0'], df_actual['y'],
           color='r', label='true function')
ax.scatter(df_actual['x0'], df_d1_predicted['y_predicted'],
           color='g', label='predicted with degree 1')
ax.set_title('Model with degree 1 feature - Underfit')
ax.grid(True)
ax.legend()
    
    Out[12]:
    
Polynomial with degree 1 is a straight line - Underfitting
Training RMSE:0.5063, Evaluation RMSE:0.4308, Baseline RMSE:0.689
In [13]:
    
# Compare the distribution of actual targets with the degree-1 predictions.
fig, ax = plt.subplots(figsize=(12, 8))
box_data = [df_actual['y'], df_d1_predicted['y_predicted']]
ax.boxplot(box_data, labels=['actual','predicted with deg1'])
ax.set_title('Box Plot - Actual, Predicted')
ax.set_ylabel('Target Attribute')
ax.grid(True)
    
In [14]:
    
# Load the downloaded degree-4 prediction output and rename its two columns.
d4_predictions_csv = os.path.join(data_path, 'output_deg_4',
                                  'bp-W4oBOhwClbH-fit_degree_4_example_test30.csv.gz')
df_d4_predicted = pd.read_csv(d4_predictions_csv)
df_d4_predicted.columns = ["Row","y_predicted"]
    
In [15]:
    
# Degree-4 model: training samples (blue), evaluation targets (red) and
# the model's predictions (green), all plotted against feature x0.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(df_samples['x0'], df_samples['y'],
           color='b', label='samples')
ax.scatter(df_actual['x0'], df_actual['y'],
           color='r', label='true function')
ax.scatter(df_actual['x0'], df_d4_predicted['y_predicted'],
           color='g', label='predicted with degree 4')
ax.set_title('Model with degree 4 features - normal fit')
ax.grid(True)
ax.legend()
    
    Out[15]:
    
Good Fit with degree 4 polynomial
Training RMSE:0.2563, Evaluation RMSE:0.1493, Baseline RMSE:0.689
In [16]:
    
# Compare actual targets with the degree-1 and degree-4 predictions.
fig, ax = plt.subplots(figsize=(12, 8))
box_data = [df_actual['y'],
            df_d1_predicted['y_predicted'],
            df_d4_predicted['y_predicted']]
ax.boxplot(box_data,
           labels=['actual','predicted with deg1','predicted with deg4'])
ax.set_title('Box Plot - Actual, Predicted')
ax.set_ylabel('Target Attribute')
ax.grid(True)
    
    
In [17]:
    
# Load the downloaded degree-15 prediction output and rename its two columns.
d15_predictions_csv = os.path.join(data_path, 'output_deg_15',
                                   'bp-rBWxcnPN3zu-fit_degree_15_example_test30.csv.gz')
df_d15_predicted = pd.read_csv(d15_predictions_csv)
df_d15_predicted.columns = ["Row","y_predicted"]
    
In [18]:
    
# Degree-15 model: training samples (blue), evaluation targets (red) and
# the model's predictions (green), all plotted against feature x0.
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df_samples['x0'],
            y = df_samples['y'],
            color = 'b',
            label = 'samples')
plt.scatter(x = df_actual['x0'],
            y = df_actual['y'],
            color = 'r',
            label = 'true function')
plt.scatter(x = df_actual['x0'],
            y = df_d15_predicted['y_predicted'],
            color = 'g',
            label = 'predicted with degree 15')
# Title added so this figure stands alone like the degree-1 and degree-4 plots.
plt.title('Model with degree 15 features')
plt.grid(True)
plt.legend()
    
    Out[18]:
    
Not quite overfitting as shown in the scikit-learn example; the fit is actually pretty good here.
Training RMSE:0.2984, Evaluation RMSE:0.1222, Baseline RMSE:0.689
In [19]:
    
# Compare actual targets with the predictions from all three models.
fig, ax = plt.subplots(figsize=(12, 8))
box_data = [df_actual['y'],
            df_d1_predicted['y_predicted'],
            df_d4_predicted['y_predicted'],
            df_d15_predicted['y_predicted']]
ax.boxplot(box_data,
           labels=['actual','predicted deg1','predicted deg4','predicted deg15'])
ax.set_title('Box Plot - Actual, Predicted')
ax.set_ylabel('Target Attribute')
ax.grid(True)
    
    
To add polynomial features that combine all input features, use scikit-learn's `PolynomialFeatures`. Anaconda includes scikit-learn by default.
We saw good performance with degree 4; any additional features may bring incremental improvement, but at the cost of added complexity in managing features.